# Example of reparameterizing a model to yield cell means quinn <- read.csv('ecol 563/quinn1.csv') quinn$Density <- factor(quinn$Density) # effects model out1 <- lm(Eggs~Density*Season, data=quinn) summary(out1) # cell means model out2 <- lm(Eggs~Density:Season-1, data=quinn) # models have same AIC AIC(out1, out2) # cell means summary(out2) # confidence intervals for means confint(out2) # binary data example parasites <- read.table('ecol 563/Parasite.txt',header=T) parasites[1:8,] # fit model with all predictors out1 <- glm(infection~age+weight+sex, data=parasites, family=binomial) summary(out1) # Wald test says age is not significant, compare with LR test out2 <- glm(infection~weight+sex, data=parasites, family=binomial) anova(out2,out1,test='Chisq') # odds ratios from model exp(coef(out1)[2:4]) # default plot of response is a mosaic plot plot(infection~weight, data=parasites) # plot binary data plot(as.numeric(infection)-1~weight, data=parasites) # jitter the data and color the observations by sex plot(jitter(as.numeric(infection))-1 ~ jitter(weight), data=parasites, col=as.numeric(sex)) # decrease the jitter plot(jitter(as.numeric(infection), a=.05)-1 ~ jitter(weight, a=.1), data=parasites, col=as.numeric(sex), xlab='weight', ylab='Probability') # model with weight and sex coef(out2) # linear predictor eta <- function(x,sex) coef(out2)[1] + coef(out2)[2]*x + coef(out2)[3]*(sex=='male') eta(1,'female') eta(1,'male') # inverse logit inv.logit <- function(eta) exp(eta)/(1+exp(eta)) # add predicted probabilities to graph plot(jitter(as.numeric(infection),a=.05)-1 ~ jitter(weight,a=.1), data=parasites, col=as.numeric(sex), xlab='weight', ylab='Probability') curve(inv.logit(eta(x,'female')), add=T, lty=2) curve(inv.logit(eta(x,'male')), add=T, lty=2, col=2) # assess functional forms of predictors with a GAM library(mgcv) out1.gam <- gam(infection ~ sex + s(weight)+s(age), data=parasites, family=binomial) # plot estimated smmooths plot(out1.gam) # plot smooth for age plot(out1.gam, select=2) abline(h=0, lty=2, col=2) # plot smooth for weight plot(out1.gam, select=1) abline(h=0, lty=2, col=2) # add a quadratic term in age out3 <- glm(formula = infection ~ age +I(age^2) + weight + sex, family = binomial, data=parasites) # LR test for quadatic age anova(out1, out3, test='Chisq') summary(out3) # re-examine smooth for weight with model containing quadratic age out1.gam <- gam(infection ~ sex + s(weight) + age + I(age^2), data=parasites, family=binomial) plot(out1.gam) # fit breakpoint model for weight with a break point of 8 out4 <- glm(formula = infection ~ age + I(age^2) + sex + I((weight>8)*(weight-8)), family = binomial, data=parasites) # this is an improvement AIC(out3, out4) summary(out4) # possible choices for break points sort(unique(parasites$weight)) cvec <- 3:16 # function to fit model at specific break points c.func <- function(c) { mymodel <- glm(infection ~ sex + age + I(age^2) + I((weight>=c)*(weight-c)), data=parasites, family=binomial) c(c, logLik(mymodel), AIC(mymodel))} # run function sapply(cvec, c.func) # best break point is c = 12 out6 <- glm(infection ~ sex + age + I(age^2) + I((weight>=12)*(weight-12)), data=parasites, family=binomial) summary(out6) # compare models AIC(out1, out3, out6) # comparison is unfair because we're not counting break point as a parameter my.aic <- AIC(out1, out3, out6) # add to AIC my.aic[,"AIC"] <- my.aic[,"AIC"]+c(0,0,2) # add parameter my.aic[,"df"] <- my.aic[,"df"]+c(0,0,1) #### model calibration ########### # predicted probabilities round(fitted(out6),3) # decision rule: classify as infected if p > 0.5 # confusion matrix tab0 <- table(fitted(out6)>0.5, parasites$infection) tab0 # calculate specificity and sensitivity diag(tab0)/apply(tab0,2,sum) # function to fit breakpoint model precision <- function(c,model, response) { tab0 <- table(fitted(model)>=c, response) rownames(tab0)<-c('-','+') out <- diag(tab0)/apply(tab0, 2, sum) names(out) <- c('specificity', 'sensitivity') list(tab0, out) } precision(.5, out6, parasites$infection) # obtaining specificity and sensitivity library(ROCR) # create prediction object pred1 <- prediction(fitted(out6), parasites$infection) # extract true positive and true negative rates stats1 <- performance(pred1, 'tpr', 'tnr') # the result is an S4 object str(stats1) stats1@y.values stats1@x.values stats1@alpha.values # change infinite value to 1 stats1@alpha.values[[1]][1]<-1 # plot specificity plot(stats1@alpha.values[[1]], stats1@x.values[[1]], type='s', xlab='cut-off', ylab='Classification rate') # add sensitivity lines(stats1@alpha.values[[1]], stats1@y.values[[1]], type='s', col=2) legend('right', c('specificity', 'sensitivity'), col=1:2, lty=1, bty='n', cex=.9) # determinine minimized difference threshold (MDT) which.min(abs(stats1@x.values[[1]]-stats1@y.values[[1]])) stats1@alpha.values[[1]][26] precision(stats1@alpha.values[[1]][26], out6,parasites$infection) abline(v=stats1@alpha.values[[1]][26], lty=2, col=4) # determinine maximized sum threshold (MST) which.max(stats1@x.values[[1]]+stats1@y.values[[1]]) stats1@alpha.values[[1]][37] abline(v=stats1@alpha.values[[1]][37], col=3, lty=2) ############ ROC curves ########## stats1a <- performance(pred1,'tpr','fpr') str(stats1a) plot(stats1a@x.values[[1]], stats1a@y.values[[1]], type='s', col='grey70', lwd=2, xlab='FPR', ylab='TPR') # obtain TPR and FPR of a second model pred2 <- prediction(fitted(out1), parasites$infection) stats2a <- performance(pred2,'tpr','fpr') # add ROC curve to previous graph lines(stats2a@x.values[[1]], stats2a@y.values[[1]], type='s', col=2) ##### Area under the curve ###### stats1b <- performance(pred1, 'auc') stats2b <- performance(pred2, 'auc') str(stats1b) stats1b@y.values stats2b@y.values #### 10-fold cross-validation of binary model ###### library(DAAG) cv.binary(out6) cv.binary(out1)
Created by Pretty R at inside-R.org